# 📘 Full Note: Devanagari → Odia XML Transliteration Tool

This guide helps you set up and use a Python-based tool to **convert Devanagari script** in XML files to **Odia script**, including conjuncts and punctuation rules.

---

## 🧰 Features

✅ Converts Devanagari characters to Odia using a base map  
✅ Supports complex conjunct mapping using JSON files  
✅ Cleans up invisible characters like Zero Width Joiner (`\u200D`)  
✅ Fixes punctuation spacing (like `।`, `॥`, `?`, `!`)  
✅ Handles individual XML files or entire directories  
✅ Output files are saved with `.or.xml` extension

---

## 🖥️ 1. Prerequisites

- **Python 3.7+** must be installed

### 🔹 Install Python (if not already installed)

#### Windows:

1. Download from [https://www.python.org/downloads](https://www.python.org/downloads)
2. Run the installer.
3. ✅ **Important**: Check the box for **“Add Python to PATH”**
4. Click **Install Now**

#### Termux:

```bash
pkg install python
```

#### Linux (Debian/Ubuntu):

```bash
sudo apt update
sudo apt install python3 python3-pip
```

---

## 📂 2. Project Structure

```
transliterator/
├── deva-odia.py            # Main script (see below)
├── conjuncts/            # Folder for conjunct JSON files
├── deva/                # Folder containing Devanagari XML files
└── odia/               # Output folder for Odia XML files
```

---

## 🧾 3. Save the Script

Create a file called `deva-odia.py` (the name used in the project structure above and in the run commands below) and paste the **complete script below**:

```python
#!/usr/bin/env python3
import argparse
import os
import json
import re
from pathlib import Path
import xml.etree.ElementTree as ET

# Devanagari → Odia lookup table, keyed by individual Devanagari characters.
# Characters that do not appear here are left untouched by the lookup in
# transliterate_text (it falls back to the input character).
BASE_MAP = {
    # Independent vowels
    "अ": "ଅ", "आ": "ଆ", "इ": "ଇ", "ई": "ଈ", "उ": "ଉ", "ऊ": "ଊ", "ऋ": "ଋ",
    "ए": "ଏ", "ऐ": "ଐ", "ओ": "ଓ", "औ": "ଔ",
    # Consonants, one varga per line
    "क": "କ", "ख": "ଖ", "ग": "ଗ", "घ": "ଘ", "ङ": "ଙ",
    "च": "ଚ", "छ": "ଛ", "ज": "ଜ", "झ": "ଝ", "ञ": "ଞ",
    "ट": "ଟ", "ठ": "ଠ", "ड": "ଡ", "ढ": "ଢ", "ण": "ଣ",
    "त": "ତ", "थ": "ଥ", "द": "ଦ", "ध": "ଧ", "न": "ନ",
    "प": "ପ", "फ": "ଫ", "ब": "ବ", "भ": "ଭ", "म": "ମ",
    # Semivowels, sibilants, aspirate and retroflex la
    "य": "ୟ", "र": "ର", "ल": "ଲ", "व": "ୱ",
    "श": "ଶ", "ष": "ଷ", "स": "ସ", "ह": "ହ", "ळ": "ଳ",
    # Dependent vowel signs
    "ा": "ା", "ि": "ି", "ी": "ୀ", "ु": "ୁ", "ू": "ୂ", "ृ": "ୃ",
    "े": "େ", "ै": "ୈ", "ो": "ୋ", "ौ": "ୌ",
    # Anusvara, candrabindu, visarga and virama
    "ं": "ଂ", "ँ": "ଁ", "ः": "ଃ", "्": "୍",
    # Digits 0-9
    "०": "୦", "१": "୧", "२": "୨", "३": "୩", "४": "୪",
    "५": "୫", "६": "୬", "७": "୭", "८": "୮", "९": "୯",
}

def load_conjuncts(conjuncts_dir: str) -> dict:
    """Merge every ``*.json`` file in *conjuncts_dir* into one mapping.

    Files are read in sorted path order so the result is reproducible:
    when the same key appears in more than one file, the file that sorts
    last wins. (Plain ``glob`` order depends on the filesystem.)

    Args:
        conjuncts_dir: Directory holding the conjunct JSON files. Each file
            is expected to contain a JSON object that ``dict.update`` can
            consume.

    Returns:
        The merged mapping, or an empty dict (after printing a warning) if
        the directory does not exist.
    """
    conjunct_map = {}
    if not os.path.isdir(conjuncts_dir):
        print(f"⚠️ Warning: No conjuncts directory found at '{conjuncts_dir}', using only base map.")
        return conjunct_map

    # sorted() pins the merge order; later files override earlier ones.
    for file in sorted(Path(conjuncts_dir).glob("*.json")):
        with open(file, encoding="utf-8") as f:
            conjunct_map.update(json.load(f))
    return conjunct_map

def adjust_punctuation(text):
    """Insert a space before each sentence mark that lacks one.

    The marks handled are the danda (``।``), double danda (``॥``), ``?``
    and ``!``. A space is added whenever the character directly before the
    mark is anything other than an ASCII space; note that this includes
    tabs and newlines. A mark at the very start of the text is left alone.

    The marks are processed one after another, so in a run such as ``?!``
    each mark ends up with its own leading space.

    Args:
        text: The string to adjust.

    Returns:
        The adjusted string.
    """
    # Regex fragments, hence the escaped question mark.
    marks = ('।', '॥', r'\?', '!')
    for mark in marks:
        # The previous second pass, (^|\s)(mark) -> \1\2, replaced every
        # match with itself and has been dropped; results are unchanged.
        text = re.sub(rf'([^ ])({mark})', r'\1 \2', text)
    return text

def transliterate_text(text, full_conjunct_map):
    """Convert one Devanagari string to Odia.

    Steps, in order: drop zero-width joiners, substitute conjuncts (longest
    keys first so shorter keys cannot split a longer match), map the
    remaining characters through ``BASE_MAP``, then fix punctuation spacing.

    Args:
        text: The string to convert; falsy values are returned unchanged.
        full_conjunct_map: Mapping of source sequences to replacements.

    Returns:
        The converted string, or *text* itself when it is empty or ``None``.
    """
    if not text:
        return text

    cleaned = text.replace('\u200D', '')

    longest_first = sorted(full_conjunct_map, key=len, reverse=True)
    for conjunct in longest_first:
        cleaned = cleaned.replace(conjunct, full_conjunct_map[conjunct])

    # Anything without a BASE_MAP entry passes through as-is.
    pieces = []
    for ch in cleaned:
        pieces.append(BASE_MAP.get(ch, ch))

    return adjust_punctuation(''.join(pieces))

def transliterate_element(element, conjunct_map: dict):
    """Transliterate all character data under *element*, in place.

    Every node in the subtree (including *element* itself) has its ``text``
    and ``tail`` run through ``transliterate_text``. Tag names and
    attributes are not touched.

    Args:
        element: Root of the subtree to convert.
        conjunct_map: Conjunct mapping forwarded to ``transliterate_text``.
    """
    # iter() yields the element followed by all of its descendants.
    for node in element.iter():
        if node.text:
            node.text = transliterate_text(node.text, conjunct_map)
        if node.tail:
            node.tail = transliterate_text(node.tail, conjunct_map)

def remove_self_closing_tags(xml_text: str) -> str:
    """Expand self-closing tags into explicit open/close pairs.

    ``<tag attr="v" />`` becomes ``<tag attr="v"></tag>``. The full tag
    name is captured, including ``-``, ``.`` and ``:``, so hyphenated or
    namespace-prefixed elements get a matching closing tag. Whitespace
    before ``/>`` is dropped rather than left inside the start tag.

    NOTE(review): this is a regex rewrite, not a parser. It assumes ``<``
    and ``>`` never appear literally inside a tag, as is the case for
    serializer output where attribute values are escaped.

    Args:
        xml_text: Serialized XML.

    Returns:
        The XML text with every self-closing tag expanded.
    """
    # Group 1: tag name. Group 2 (optional): attributes, which must start
    # with a non-space, non-slash character so trailing blanks stay out.
    pattern = r"<(\w[\w.:-]*)(\s+[^\s<>/][^<>]*?)?\s*/>"
    return re.sub(pattern, r"<\1\2></\1>", xml_text)

def transliterate_xml_file(input_path: Path, output_path: Path, conjunct_map: dict):
    """Convert one UTF-16 Devanagari XML file and write the Odia result.

    Leading processing instructions (``<?...?>``) are split off and written
    back unchanged, except that an ``xml-stylesheet`` instruction is
    replaced with one pointing at ``tipitaka-odia.xsl``. The rest is
    parsed, its text transliterated, and self-closing tags are expanded
    before the file is written as UTF-16.

    A file that cannot be decoded as UTF-16 or cannot be parsed is reported
    on stdout and skipped, so one bad file does not abort a batch run.

    Args:
        input_path: Source XML file (UTF-16).
        output_path: Destination file; parent directories are created.
        conjunct_map: Conjunct mapping used for transliteration.
    """
    try:
        with open(input_path, 'r', encoding='utf-16') as f:
            content = f.read()
    except UnicodeError as e:
        # Handled like a parse error: report the file and move on.
        print(f"✖ Encoding error in {input_path} (expected UTF-16): {e}")
        return

    prolog_lines = []
    rest = content.lstrip()

    while rest.startswith('<?'):
        close = rest.find('?>')
        if close == -1:
            # Unterminated instruction: leave it for the parser to report.
            break
        end = close + 2
        line = rest[:end].strip()
        if line.startswith('<?xml-stylesheet'):
            line = '<?xml-stylesheet type="text/xsl" href="tipitaka-odia.xsl"?>'
        prolog_lines.append(line)
        rest = rest[end:].lstrip()

    try:
        root = ET.fromstring(rest)
    except ET.ParseError as e:
        print(f"✖ Parse error in {input_path}: {e}")
        return

    transliterate_element(root, conjunct_map)

    xml_body = ET.tostring(root, encoding='unicode')
    xml_body = remove_self_closing_tags(xml_body)

    # Joining the parts together avoids a leading blank line when the
    # input had no prolog.
    final_output = '\n'.join(prolog_lines + [xml_body])

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-16') as f:
        f.write(final_output)

    print(f"✓ Converted: {input_path} → {output_path}")

def process_input(input_path: str, output_dir: str, conjuncts_dir: str):
    """Convert a single XML file or every XML file under a directory.

    A single ``name.xml`` is written to ``output_dir/name.or.xml``. For a
    directory, every ``*.xml`` found recursively is converted and written
    under *output_dir* at the same relative path, with the final ``.xml``
    replaced by ``.or.xml``.

    Args:
        input_path: Path to a ``.xml`` file or to a directory.
        output_dir: Directory that receives the converted files.
        conjuncts_dir: Directory of conjunct JSON files.
    """
    conjunct_map = load_conjuncts(conjuncts_dir)
    input_path = Path(input_path)
    output_dir = Path(output_dir)

    if input_path.is_file() and input_path.suffix == ".xml":
        output_file = output_dir / f"{input_path.stem}.or.xml"
        transliterate_xml_file(input_path, output_file, conjunct_map)

    elif input_path.is_dir():
        # Snapshot the file list before writing anything. The glob is lazy,
        # so if output_dir lies inside input_path the files written during
        # this run would otherwise be discovered and converted again.
        # Sorting also makes the processing order reproducible.
        xml_files = sorted(input_path.glob("**/*.xml"))
        for xml_file in xml_files:
            rel_path = xml_file.relative_to(input_path)
            output_file = output_dir / rel_path
            output_file = output_file.with_suffix(".or.xml")
            transliterate_xml_file(xml_file, output_file, conjunct_map)

    else:
        print("❌ Error: Input must be a .xml file or a directory containing XML files.")

def main():
    """Parse the command line and run the conversion.

    Positional ``input`` is the file or directory to convert. ``-o`` /
    ``--output-dir`` and ``-c`` / ``--conjuncts`` select the output and
    conjunct directories, defaulting to ``converted`` and ``conjuncts``.
    """
    cli = argparse.ArgumentParser(description="Devanagari → Odia XML Converter")
    cli.add_argument("input", help="Path to input XML file or directory")
    cli.add_argument(
        "-o", "--output-dir",
        default="converted",
        help="Output directory (default: converted/)",
    )
    cli.add_argument(
        "-c", "--conjuncts",
        default="conjuncts",
        help="Path to conjuncts directory (default: conjuncts/)",
    )
    options = cli.parse_args()

    process_input(options.input, options.output_dir, options.conjuncts)


if __name__ == "__main__":
    main()
```

---

## 🧪 4. How to Run

### 🔹 To convert one file:

```bash
python deva-odia.py deva/file-name.xml -o odia/
```

### 🔹 To convert an entire folder:

```bash
python deva-odia.py deva/ -o odia/
```
---

## ✅ Output

- Output files are saved with the `.or.xml` extension under the specified output folder (`converted/` if `-o` is omitted).
- Folder structure is preserved if you're processing a directory.
